# -*- coding: utf-8 -*-
"""IMDB.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1qFIQNWhlY_mvKhYdw4jF-CgUTH9OXlR_
"""

from google.colab import drive
drive.mount('/content/drive')

# Install the progress-bar dependency used while loading the reviews.
# The notebook export contained the IPython shell escape
# `!pip install pyprind`, which is a SyntaxError when this file is run as a
# plain Python script. Invoking pip through the running interpreter is valid
# Python and installs into the active environment.
import subprocess
import sys
subprocess.run([sys.executable, '-m', 'pip', 'install', 'pyprind'], check=True)

import pyprind
import sys
import os
import pandas as pd
# Root directory of the unzipped movie dataset; change 'basepath' to wherever
# the archive was extracted.
basepath = '/content/drive/MyDrive/aclImdb'
# Maps the per-class sub-directory name to the integer sentiment label.
labels = {'pos': 1, 'neg': 0}
# Progress bar created for 50000 iterations; update() is called once per
# review file that is read.
pbar = pyprind.ProgBar(50000, stream=sys.stdout)

# Collected rows of the form [review text, sentiment label].
data = []

for split in ('test', 'train'):
    for label_name in ('pos', 'neg'):
        path = os.path.join(basepath, split, label_name)
        # A missing split/label directory is skipped rather than treated as
        # fatal here; a completely empty result is reported after the loop.
        if not os.path.isdir(path):
            continue
        # Sorted so the rows are collected in a deterministic order.
        for file_name in sorted(os.listdir(path)):
            file_path = os.path.join(path, file_name)
            # Ignore sub-directories and other non-file entries.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as infile:
                txt = infile.read()
            data.append([txt, labels[label_name]])
            pbar.update()

# Fail early with a clear message instead of silently building an empty
# DataFrame that only breaks later, far away from the real cause.
if not data:
    raise FileNotFoundError(
        f'No review files found under {basepath!r}; check that the dataset '
        'was extracted there.'
    )

# Build the DataFrame once, after all rows have been collected.
df = pd.DataFrame(data, columns=['review', 'sentiment'])

import numpy as np
# Seed NumPy's global RNG so the permutation below is reproducible.
np.random.seed(0)
# Shuffle the rows by reindexing with a random permutation of the index.
df = df.reindex(np.random.permutation(df.index))
# Write the shuffled reviews to CSV without the index column.
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

# Reload the reviews from the CSV file; the DataFrame gets a fresh default
# index because the file was written with index=False.
df = pd.read_csv('movie_data.csv', encoding='utf-8')
# The following column renaming is necessary on some computers; columns that
# are not named "0" or "1" are left untouched by rename().
df = df.rename(columns={"0": "review", "1": "sentiment"})
# Preview the first three rows (the value is only displayed in a notebook).
df.head(3)

# (number of rows, number of columns) of the loaded data.
df.shape

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# Small bag-of-words demonstration on three toy sentences.
count = CountVectorizer()
docs = np.array(['The sun is shining','The weather is sweet','The sun is shining, the weather is sweet,and one and one is two'])
# Learn the vocabulary and build the document-term count matrix.
bag = count.fit_transform(docs)

# Mapping from each term to its column index in the count matrix.
print(count.vocabulary_)

# Dense view of the counts: one row per document, one column per term.
print(bag.toarray())

from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts with tf-idf (smoothed idf, L2-normalised rows).
# NOTE: the name `tfidf` is rebound to a TfidfVectorizer further down.
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
# Print floats with two decimals to keep the matrix readable.
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

# Last 50 characters of the review at index label 0 (displayed only in a
# notebook).
df.loc[0, 'review'][-50:]

import re
def preprocessor(text):
    """Strip markup and punctuation from *text* while keeping emoticons.

    HTML-like tags are removed, the remaining text is lowercased and every
    run of non-word characters is collapsed into a single space. Emoticons
    found in the tag-free text (eyes ``:``, ``;`` or ``=``, an optional ``-``
    nose, and a mouth ``)``, ``(``, ``D`` or ``P``) are appended at the end
    with the nose removed.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lowercased text followed by the space-separated
        emoticons, if any.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being treated as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last
    # word whenever the text does not end in punctuation ("...it:)").
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix

# Try the cleaner on the tail of a real review and on a synthetic example
# containing a tag and several emoticons (displayed only in a notebook).
preprocessor(df.loc[0, 'review'][-50:])
preprocessor("</a>This :) is :( a test :-)!")

# Clean every review in place.
df['review'] = df['review'].apply(preprocessor)

def tokenizer(text):
    """Split *text* into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens
tokenizer('runners like running and thus they run')

from nltk.stem.porter import PorterStemmer
# Single stemmer instance shared by every call to tokenizer_porter.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split *text* on whitespace and return the Porter stem of each token."""
    return list(map(porter.stem, text.split()))
tokenizer_porter('runners like running and thus they run')

import nltk
# Fetch the NLTK stop-word corpus used below.
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop words; also passed to the grid search as a 'stop_words' option.
stop = stopwords.words('english')
# Demo: stem a sample sentence and drop the stop words (displayed only in a
# notebook).
[w for w in tokenizer_porter('a runner likes'
  ' running and runs a lot')
  if w not in stop]

# Split positionally: the first 25000 shuffled rows train the model and the
# remaining rows are held out for testing. Label-based `.loc[:25000]` slicing
# includes its end label, which would put row 25000 in BOTH sets; `.iloc`
# uses half-open positional slices, so the two sets stay disjoint.
n_train = 25000
X_train = df['review'].iloc[:n_train].values
y_train = df['sentiment'].iloc[:n_train].values
X_test = df['review'].iloc[n_train:].values
y_test = df['sentiment'].iloc[n_train:].values

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer with its own lowercasing and preprocessing switched off; the
# reviews were already passed through preprocessor() earlier in this file.
tfidf = TfidfVectorizer(strip_accents=None,
  lowercase=False,
  preprocessor=None)
# Two small grids, each also trying C = 1.0 and C = 10.0:
#   1. the vectorizer's tf-idf settings as constructed above, comparing the
#      plain tokenizer against the Porter-stemming tokenizer;
#   2. use_idf=False and norm=None, comparing the stop-word list against no
#      stop-word removal.
small_param_grid = [
  {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l2'],
    'clf__C': [1.0, 10.0]
  },
  {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer],
    'vect__use_idf':[False],
    'vect__norm':[None],
    'clf__penalty': ['l2'],
    'clf__C': [1.0, 10.0]
  },
]
# The step names 'vect' and 'clf' are the prefixes used by the grid keys.
lr_tfidf = Pipeline([
  ('vect', tfidf),
  ('clf', LogisticRegression(solver='liblinear'))
])
# 5-fold cross-validated search scored by accuracy, with n_jobs=1.
gs_lr_tfidf = GridSearchCV(lr_tfidf, small_param_grid,
  scoring='accuracy', cv=5,
  verbose=2, n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)

# Parameter combination selected by the grid search.
print(f'Best parameter set: {gs_lr_tfidf.best_params_}')

# Cross-validated score of the selected combination.
print(f'CV Accuracy: {gs_lr_tfidf.best_score_:.3f}')
# Score the selected pipeline on the held-out test split.
clf = gs_lr_tfidf.best_estimator_
print(f'Test Accuracy: {clf.score(X_test, y_test):.3f}')